#install.packages("datasets")
# Inspect the data by printing state.x77 at the console.
# Pull the modelling variables out of the built-in state.x77 matrix.
# Column indexing by name keeps the row names on each vector.
Income <- state.x77[, "Income"]
Pop    <- state.x77[, "Population"]
Area   <- state.x77[, "Area"]
Illit  <- state.x77[, "Illiteracy"]
Murder <- state.x77[, "Murder"]

# Full model: ordinary least squares fit of Income on the four predictors.
fit <- lm(Income ~ Pop + Area + Illit + Murder)
summary(fit)
##
## Call:
## lm(formula = Income ~ Pop + Area + Illit + Murder)
##
## Residuals:
## Min 1Q Median 3Q Max
## -795.8 -336.4 -105.5 316.6 1121.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.700e+03 1.700e+02 27.654 < 2e-16 ***
## Pop 4.020e-02 1.703e-02 2.361 0.02263 *
## Area 3.032e-03 8.481e-04 3.575 0.00085 ***
## Illit -4.009e+02 1.656e+02 -2.421 0.01957 *
## Murder -2.448e+01 2.969e+01 -0.824 0.41406
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 487.7 on 45 degrees of freedom
## Multiple R-squared: 0.4214, Adjusted R-squared: 0.37
## F-statistic: 8.194 on 4 and 45 DF, p-value: 4.715e-05
coefficients(fit) # least-squares estimate of the coefficient vector (beta hat)
## (Intercept) Pop Area Illit Murder
## 4.700257e+03 4.019908e-02 3.032067e-03 -4.008758e+02 -2.447963e+01
# Residual sum of squares of the full model.
RSS <- sum(residuals(fit)^2)
# Unbiased estimate of the error variance (S^2 in part a): RSS divided by the
# residual degrees of freedom n - p. The model estimates p = 5 coefficients
# (intercept + 4 slopes), so the divisor is 50 - 5 = 45, not n - 4. This
# agrees with "Residual standard error: ... on 45 degrees of freedom" in
# summary(fit). df.residual() reads the value from the fit instead of
# hard-coding the parameter count.
sigmasquare <- RSS / df.residual(fit)
# Part (b): residual diagnostics for the full model.
r <- fit$residuals
# Residuals against Area. The predictor goes on the horizontal axis and the
# residuals on the vertical axis, so any pattern in the spread of the
# residuals as Area grows is visible in the usual orientation.
plot(Area, r, xlab = "Area", ylab = "Residuals")
hist(residuals(fit)) # histogram of the residuals
# The mean residual is zero up to floating-point error because the model
# contains an intercept.
mean(r)
## [1] 1.054781e-15
mean(residuals(fit))
## [1] 1.054781e-15
# Part (c): refit the model with Area entered on the log scale.
logArea <- log(Area)
fitc <- lm(Income ~ Pop + logArea + Illit + Murder) # refined model
summary(fitc)
##
## Call:
## lm(formula = Income ~ Pop + logArea + Illit + Murder)
##
## Residuals:
## Min 1Q Median 3Q Max
## -837.33 -364.14 -29.64 265.45 2225.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5136.49137 777.88348 6.603 3.96e-08 ***
## Pop 0.03484 0.01919 1.816 0.0761 .
## logArea -30.20394 73.93083 -0.409 0.6848
## Illit -504.10599 193.89257 -2.600 0.0126 *
## Murder 8.56928 35.12602 0.244 0.8084
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 551.6 on 45 degrees of freedom
## Multiple R-squared: 0.2599, Adjusted R-squared: 0.1941
## F-statistic: 3.95 on 4 and 45 DF, p-value: 0.007855
# Residual sum of squares of the refined (log-area) model.
RSSc <- sum(resid(fitc)^2)
# Null hypothesis: the coefficients of logArea and Murder are both zero.
# Reduced model under that hypothesis:
fitc2 <- lm(Income ~ Pop + Illit)
summary(fitc2)
##
## Call:
## lm(formula = Income ~ Pop + Illit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -848.72 -349.42 -60.84 294.78 2171.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4833.16224 176.61055 27.366 < 2e-16 ***
## Pop 0.03555 0.01741 2.042 0.046780 *
## Illit -468.63466 127.49422 -3.676 0.000608 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 540.8 on 47 degrees of freedom
## Multiple R-squared: 0.257, Adjusted R-squared: 0.2253
## F-statistic: 8.127 on 2 and 47 DF, p-value: 0.0009307
# Residual sum of squares of the reduced model (Income ~ Pop + Illit).
RSSc2 <- sum(residuals(fitc2)^2)
RSSc2
## [1] 13747064
anova(fitc2)
## Analysis of Variance Table
##
## Response: Income
## Df Sum Sq Mean Sq F value Pr(>F)
## Pop 1 802184 802184 2.7426 0.1043685
## Illit 1 3951845 3951845 13.5110 0.0006075 ***
## Residuals 47 13747064 292491
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Partial F-test of the reduced model (fitc2) against the refined full model
# (fitc): F = ((RSS_reduced - RSS_full) / q) / (RSS_full / df_full).
# The degrees of freedom are read from the fitted models (45 and 2 here)
# rather than hard-coded, so the test stays correct if either model changes.
df_full_c <- df.residual(fitc)
q_c <- df.residual(fitc2) - df_full_c
fc <- ((RSSc2 - RSSc) / q_c) / (RSSc / df_full_c)
# fc_value holds the p-value of the test (not the F statistic). The upper
# tail is requested directly, which avoids the cancellation in 1 - pf(...)
# when the p-value is very small.
fc_value <- pf(fc, q_c, df_full_c, lower.tail = FALSE)
fc_value
## [1] 0.9160031
# Part (d): extend the reduced model with a quadratic illiteracy term.
Illit2 <- Illit^2
refit3 <- lm(Income ~ Pop + Illit + Illit2)
summary(refit3)
##
## Call:
## lm(formula = Income ~ Pop + Illit + Illit2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -864.42 -363.82 18.94 231.73 1915.90
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4091.32194 376.30857 10.872 2.66e-14 ***
## Pop 0.02364 0.01758 1.345 0.1852
## Illit 931.78009 645.73570 1.443 0.1558
## Illit2 -488.22974 221.03369 -2.209 0.0322 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 519.8 on 46 degrees of freedom
## Multiple R-squared: 0.3282, Adjusted R-squared: 0.2844
## F-statistic: 7.491 on 3 and 46 DF, p-value: 0.0003485
anova(refit3)
## Analysis of Variance Table
##
## Response: Income
## Df Sum Sq Mean Sq F value Pr(>F)
## Pop 1 802184 802184 2.9689 0.0915941 .
## Illit 1 3951845 3951845 14.6261 0.0003929 ***
## Illit2 1 1318265 1318265 4.8790 0.0322058 *
## Residuals 46 12428798 270191
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual sum of squares of the quadratic model.
RSS4 <- sum(residuals(refit3)^2)
RSS4
## [1] 12428798
# F-test of the reduced model (fitc2) against the quadratic model (refit3),
# i.e. H0: the coefficient of Illit2 is zero. Degrees of freedom are taken
# from the fitted models (46 and 1 here) instead of being hard-coded.
df_full_d <- df.residual(refit3)
q_d <- df.residual(fitc2) - df_full_d
f_2 <- ((RSSc2 - RSS4) / q_d) / (RSS4 / df_full_d)
# f_value_2 is the p-value of the test. It is about 0.032, below 0.05, so H0
# is rejected at the 5% level: the Illit2 term is significant.
f_value_2 <- pf(f_2, q_d, df_full_d, lower.tail = FALSE)
f_value_2
## [1] 0.03220576
# Part (e): restricted model with the Pop coefficient fixed at 0.05 and the
# Illit coefficient fixed at -500. Moving those fixed terms to the left-hand
# side leaves an intercept-only regression for the adjusted response Z.
Z <- Income - 0.05 * Pop + 500 * Illit
fit5 <- lm(Z ~ 1)
summary(fit5)
##
## Call:
## lm(formula = Z ~ 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -817.38 -374.70 -93.48 233.46 2238.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4808.48 75.47 63.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 533.7 on 49 degrees of freedom
anova(fit5)
## Analysis of Variance Table
##
## Response: Z
## Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 49 13956021 284817
# Residual sum of squares of the restricted (intercept-only) model for Z.
RSS5 <- sum(residuals(fit5)^2)
RSS5
## [1] 13956021
# F-test of the restricted model (fit5) against the unrestricted model
# (fitc2): F = ((RSS_restricted - RSS_full) / q) / (RSS_full / df_full).
# Degrees of freedom come from the fitted models (47 and 2 here) instead of
# being hard-coded.
df_full_e <- df.residual(fitc2)
q_e <- df.residual(fit5) - df_full_e
f_value_e <- ((RSS5 - RSSc2) / q_e) / (RSSc2 / df_full_e)
# Upper-tail probability requested directly; avoids cancellation in
# 1 - pf(...) for small p-values.
p_value_e <- pf(f_value_e, q_e, df_full_e, lower.tail = FALSE)
print(p_value_e)
## [1] 0.701513